{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YUssqHFr0VM8",
        "outputId": "47dcb9c2-b276-43f3-9263-ef9c7fcdf45a"
      },
      "outputs": [],
      "source": [
        "# Download the dataset from Google Drive.\n",
        "# The file is addressed by its full URL: the `--id` flag is deprecated in gdown (since 4.3.1),\n",
        "# while the URL form is accepted by old and new releases alike.\n",
        "!gdown \"https://drive.google.com/uc?id=12vfq3DYFId3bsXuNj_PhsACMzrLTfObs\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CLw5KFxzz-vw"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "from sklearn.utils import resample\n",
        "from sklearn import preprocessing\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.model_selection import train_test_split\n",
        "from warnings import filterwarnings\n",
        "filterwarnings('ignore')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 379
        },
        "id": "llEWd-dM0ZQg",
        "outputId": "c8c3348b-d68c-4f7d-c587-5279e4b867b7"
      },
      "outputs": [],
      "source": [
        "# Read the raw dataset\n",
        "data = pd.read_csv(\"data_regression.csv\")\n",
        "# Drop the columns that are not used for modelling\n",
        "data = data.drop(columns=[\"year\", \"customer_id\", \"phone_no\"])\n",
        "# Inspect the dimensions, column names and dtypes of the data. The output column is checked to see\n",
        "# whether it is continuous or discrete; here it is discrete, so a classification algorithm is applied.\n",
        "print(data.shape)         # (number of rows, number of columns)\n",
        "print(data.columns)       # names of the columns\n",
        "print(data.dtypes)        # printed explicitly: a bare mid-cell expression is evaluated but never displayed\n",
        "data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wHAGp4M10cUr",
        "outputId": "20095ecf-22fd-4a0a-d1ee-e125d303154b"
      },
      "outputs": [],
      "source": [
        "# Number of missing values in each column\n",
        "data.isna().sum()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 270
        },
        "id": "qt8ctl6m0gm-",
        "outputId": "eb870fb2-a28d-4e0a-f6f7-55771f2bdaa7"
      },
      "outputs": [],
      "source": [
        "# Keep only the rows that have no missing value in any column\n",
        "final_data = data.dropna(axis=0, how=\"any\")\n",
        "final_data.head(n=5)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yL_bQ-mH0hzn",
        "outputId": "1537150a-c09c-4abb-b46c-082ee6bff1ce"
      },
      "outputs": [],
      "source": [
        "# Rows per class of the target; used to decide how many rows to sample from each class\n",
        "final_data.loc[:, \"churn\"].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TqetEXX50i7Z",
        "outputId": "632fbf54-5189-4d61-8d8f-fb1a6735aeb0"
      },
      "outputs": [],
      "source": [
        "# Number of rows kept for each class after rebalancing\n",
        "N_SAMPLES_PER_CLASS = 900\n",
        "# Categorical columns that must be turned into integer codes before modelling\n",
        "CATEGORICAL_COLUMNS = ['gender', 'multi_screen', 'mail_subscribed']\n",
        "\n",
        "data_majority = final_data[final_data['churn'] == 0]  # class 0\n",
        "data_minority = final_data[final_data['churn'] == 1]  # class 1\n",
        "\n",
        "# Upsample the minority class (sampling with replacement)\n",
        "data_minority_upsampled = resample(data_minority, replace=True, n_samples=N_SAMPLES_PER_CLASS, random_state=123)\n",
        "# Downsample the majority class (sampling without replacement)\n",
        "data_majority_downsampled = resample(data_majority, replace=False, n_samples=N_SAMPLES_PER_CLASS, random_state=123)\n",
        "\n",
        "# Concatenate the downsampled and upsampled classes into one balanced dataframe\n",
        "data2 = pd.concat([data_majority_downsampled, data_minority_upsampled])\n",
        "\n",
        "# Label-encode the categorical variables before feeding them to the model.\n",
        "# The encoder is re-fitted for every column, so after the loop it only holds the classes of the last one.\n",
        "label_encoder = preprocessing.LabelEncoder()\n",
        "for column in CATEGORICAL_COLUMNS:\n",
        "    data2[column] = label_encoder.fit_transform(data2[column])\n",
        "\n",
        "# Check the distribution of the output class again after sampling\n",
        "data2[\"churn\"].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GUpLqdEw0tXb",
        "outputId": "041cd5a3-f9ec-447a-c0d9-f09e1e8bd4e0"
      },
      "outputs": [],
      "source": [
        "# Shuffle the rows once, with a fixed seed, BEFORE separating features and target.\n",
        "# Shuffling the frame after the features have been extracted would pair every feature row with the label of a different row.\n",
        "data_shuffled = data2.sample(frac=1, random_state=42).reset_index(drop=True)\n",
        "\n",
        "# Output column\n",
        "Y = data_shuffled[\"churn\"]\n",
        "# Independent variables: every column except the output, selected by name rather than by position\n",
        "X = data_shuffled.drop(columns=[\"churn\"])\n",
        "n_samples, n_features = X.shape  # number of samples, number of input features\n",
        "\n",
        "## Data Splitting:\n",
        "## Split into train and test so the model is trained on one part and evaluated on the other.\n",
        "## The split is stratified on the output so both parts keep the same class ratio.\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y)\n",
        "\n",
        "# Scale the features to zero mean and unit variance for a better-conditioned optimisation.\n",
        "# The scaler is fitted on the training split only, and each split is transformed exactly once\n",
        "# (calling transform on data that fit_transform already scaled would standardise it twice).\n",
        "sc = StandardScaler()\n",
        "X_train = sc.fit_transform(X_train)\n",
        "X_test = sc.transform(X_test)\n",
        "\n",
        "# NOTE(review): data2 holds minority rows sampled with replacement, so copies of the same original row can land\n",
        "# on both sides of this split and inflate the test metrics; consider splitting before resampling.\n",
        "print((y_train == 1).sum())\n",
        "print((y_train == 0).sum())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VzYgdjlU0tof",
        "outputId": "5e188583-3dcd-4ddc-8b1b-dee6bb7b524a"
      },
      "outputs": [],
      "source": [
        "# Show the container type of each split before the conversion to tensors\n",
        "for split in (X_train, X_test, y_train.values, y_test.values):\n",
        "    print(type(split))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "y0raeQhA0u4z"
      },
      "outputs": [],
      "source": [
        "# Convert every split to a float32 tensor.\n",
        "# np.asarray accepts a NumPy array, a pandas Series or an already-converted CPU tensor, so this cell can be\n",
        "# re-run safely; calling `.astype` / `.values.astype` on a tensor raises an AttributeError.\n",
        "X_train = torch.as_tensor(np.asarray(X_train), dtype=torch.float32)\n",
        "X_test = torch.as_tensor(np.asarray(X_test), dtype=torch.float32)\n",
        "y_train = torch.as_tensor(np.asarray(y_train), dtype=torch.float32)\n",
        "y_test = torch.as_tensor(np.asarray(y_test), dtype=torch.float32)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TBb_XrZt00_p",
        "outputId": "4c44564c-a8c1-4071-9527-afebd5fa310f"
      },
      "outputs": [],
      "source": [
        "# Shapes of the target tensors before reshaping\n",
        "tuple(t.shape for t in (y_train, y_test))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6-9CiBo40vxM"
      },
      "outputs": [],
      "source": [
        "# View each target tensor as a column of shape (number of rows, 1)\n",
        "y_train, y_test = (t.view(t.size(0), 1) for t in (y_train, y_test))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "07XkWAA20w0U",
        "outputId": "63cb82b1-6650-442d-c911-cec1a0778e20"
      },
      "outputs": [],
      "source": [
        "# Shapes of the target tensors after reshaping\n",
        "y_train.size(), y_test.size()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fQxkh0FK02i5"
      },
      "outputs": [],
      "source": [
        "# logistic regression class\n",
        "class LogisticRegression(nn.Module):\n",
        "    def __init__(self, n_input_features):\n",
        "        super(LogisticRegression, self).__init__()\n",
        "        self.linear = nn.Linear(n_input_features, 1)\n",
        "    \n",
        "    #sigmoid transformation of the input \n",
        "    def forward(self, x):\n",
        "        y_pred = torch.sigmoid(self.linear(x))\n",
        "        return y_pred"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "C8GvUnxQ05Ik"
      },
      "outputs": [],
      "source": [
        "# Seed torch's random number generator so the randomly initialised weights are identical on every run\n",
        "torch.manual_seed(42)\n",
        "# One input per feature column of the training data\n",
        "lr = LogisticRegression(n_features)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RdYvQs1a06JP"
      },
      "outputs": [],
      "source": [
        "# Training hyper-parameters: a large number of epochs is used to see better results\n",
        "num_epochs = 500\n",
        "learning_rate = 0.0001\n",
        "\n",
        "# Logistic regression is trained with the binary cross-entropy loss\n",
        "criterion = nn.BCELoss()\n",
        "# SGD optimizer over the parameters of the model\n",
        "optimizer = torch.optim.SGD(params=lr.parameters(), lr=learning_rate)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qT5pK7jr0_Ez",
        "outputId": "abf0e908-173d-447f-f8bf-0ce55c9907e3"
      },
      "outputs": [],
      "source": [
        "for epoch in range(num_epochs):\n",
        "    # Forward pass over the whole training set, then the loss against the targets\n",
        "    y_pred = lr(X_train)\n",
        "    loss = criterion(y_pred, y_train)\n",
        "    # Backward pass, parameter update, then reset the gradients for the next epoch\n",
        "    loss.backward()\n",
        "    optimizer.step()\n",
        "    optimizer.zero_grad()\n",
        "    # Print the loss on every 20th epoch only, to keep track without flooding the output\n",
        "    if (epoch + 1) % 20 != 0:\n",
        "        continue\n",
        "    print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KYDPNSBm1C_T",
        "outputId": "77c4070e-20de-4cb0-94b1-33962a36cce8"
      },
      "outputs": [],
      "source": [
        "# Evaluate on the test split without tracking gradients\n",
        "with torch.no_grad():\n",
        "    y_predicted = lr(X_test)\n",
        "    # Round the sigmoid outputs to obtain the predicted class of each row\n",
        "    y_predicted_cls = y_predicted.round()\n",
        "    # Accuracy = number of matching predictions / number of test rows\n",
        "    n_correct = (y_predicted_cls == y_test).sum()\n",
        "    acc = n_correct / float(y_test.size(0))\n",
        "    print(f'accuracy: {acc.item():.4f}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0miFH7DO1oOq",
        "outputId": "38352b86-1590-490a-9f45-377ed543a3bc"
      },
      "outputs": [],
      "source": [
        "# Classification report of the test-set predictions\n",
        "from sklearn.metrics import classification_report\n",
        "report = classification_report(y_true=y_test, y_pred=y_predicted_cls)\n",
        "print(report)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BXKCNp_q2zhp",
        "outputId": "2fe7e571-64a6-4dc5-9be7-20d365a96a05"
      },
      "outputs": [],
      "source": [
        "# Confusion matrix of the test-set predictions\n",
        "from sklearn.metrics import confusion_matrix\n",
        "# The result gets its own name so that the imported `confusion_matrix` function is not overwritten by its output\n",
        "conf_matrix = confusion_matrix(y_test, y_predicted_cls)\n",
        "print(conf_matrix)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "x6l2_Yxr21kT"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "name": "LogisticRegressionPyTorch_PythonCodeTutorial.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
